#!/usr/bin/env python
# -*- coding: utf-8 -*- 
# 
# Eclipse Luna
# python 2.7.5
# pyev             https://pypi.python.org/pypi/pyev/
# pycurl           https://pypi.python.org/pypi/pycurl
# BeautifulSoup    http://www.crummy.com/software/BeautifulSoup/#Download

import pycurl
import StringIO
import sys
import os
import logging
import logging.handlers 
import simplejson as json
import signal
import pyev #libev的python绑定，这里主要用timer的实现部分

from bs4 import BeautifulSoup

class Spider:
    def __init__(self, logger):
        self.url = None
        self.logger = logger
        self.status = None
    def cURL(self, url):
        try:
            b = StringIO.StringIO()
            c = pycurl.Curl()
            
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.HTTPHEADER, ["Accept:"])
            c.setopt(pycurl.WRITEFUNCTION, b.write) 
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.perform() 
            self.status = c.getinfo(c.HTTP_CODE)
            #print self.status
            self.logger.info(self.status)
            return b.getvalue()

        except pycurl.error, e:
            self.logger.error("can't connect")
            c.close()
            b.close()
            sys.exit(255)
            
    def Get(self, type, url):
        if type is 1:
            self.GetRating_Sina(url)
    
    # tencent        
    def GetCiHolder(self, stockNO):
        #http://stock.finance.qq.com/corp1/stk_ciholder.php?zqdm=002432
        url = "http://stock.finance.qq.com/corp1/stk_ciholder.php?zqdm="+stockNO
        html = self.cURL(url)
        soup = BeautifulSoup(html)
        tables = soup.findAll('table')
        t_count = 0
        json_data = {}
        for table in tables:
            t_count = t_count + 1
            
            #前两个table不关注
            #if t_count<3:
            #    continue
            # 第二个表获取后，tr6出现了数据不完整现象，因此只获取第一个表
            if t_count == 3:
                json_table = {}
                trs = table.findAll('tr')
                lens = trs.__len__
                count = 0
                json_trs = {}
                for tr in trs:
                    count = count + 1
                    if count == 1:
                        # 第一行，获取报告日期，发布日期 <span class="fntTahoma">2014-09-30</span>
                        spans = tr.findAll('span', attrs = {'class': ['fntTahoma']})
                        report_date = spans[0].text
                        notice_date = spans[1].text
                        json_table["report_date"] = report_date
                        json_table["notice_date"] = notice_date
                        
                    elif count == 2:
                        # header
                        continue
                    else:
                        # 第三行开始获取正文
                        json_tr = {}
                        tds = tr.findAll('td')
                        # 首先判断是否为最后一行
                        if tds[0].text == u"合计":
                            json_table["tatal"] = tds[2].text
                            json_table["proportion"] = tds[4].text
                        else:
                            # 第二个表获取后，tr6出现了数据不完整现象，因此只获取第一个表
                            #if count-2 == 6:
                                #print count
                            json_tr[0] = tds[0].text
                            json_tr[1] = tds[1].text
                            json_tr[2] = tds[2].text
                            json_tr[3] = tds[3].text
                            json_tr[4] = tds[4].text
                            json_tr[5] = tds[5].text
                            json_tr[6] = tds[6].text  # 注意：header有6个字段，但是内容确有7个字段，最后一个字段包含增/减持数量
                            json_trs[count-2] = json_tr
                            #print tds
                json_table["data"] = json_trs
                
                self.logger.info(url+ " json_table[\"tatal\"]:"+ json_table["tatal"])
                print json_table
            else:
                continue
    # tencent     
    def GetHolder(self, stockNO):
        #http://stock.finance.qq.com/corp1/stk_holder.php?zqdm=002432
        url = "http://stock.finance.qq.com/corp1/stk_holder.php?zqdm="+stockNO
        html = self.cURL(url)
        soup = BeautifulSoup(html)
        tables = soup.findAll('table')
        t_count = 0
        json_data = {}
        for table in tables:
            t_count = t_count + 1
            
            #前两个table不关注
            #if t_count<3:
            #    continue
            # 第二个表获取后，tr6出现了数据不完整现象，因此只获取第一个表
            if t_count == 3:
                json_table = {}
                trs = table.findAll('tr')
                lens = trs.__len__
                count = 0
                json_trs = {}
                for tr in trs:
                    count = count + 1
                    if count == 1:
                        # 第一行，获取报告日期，发布日期 <span class="fntTahoma">2014-09-30</span>
                        spans = tr.findAll('span', attrs = {'class': ['fntTahoma']})
                        report_date = spans[0].text
                        notice_date = spans[1].text
                        json_table["report_date"] = report_date
                        json_table["notice_date"] = notice_date
                        
                    elif count == 2:
                        # header
                        continue
                    else:
                        # 第三行开始获取正文
                        json_tr = {}
                        tds = tr.findAll('td')
                        # 首先判断是否为最后一行
                        if tds[0].text == u"合计":
                            json_table["tatal"] = tds[2].text
                            json_table["proportion"] = tds[4].text
                        else:
                            # 第二个表获取后，tr6出现了数据不完整现象，因此只获取第一个表
                            #if count-2 == 6:
                                #print count
                            json_tr[0] = tds[0].text
                            json_tr[1] = tds[1].text
                            json_tr[2] = tds[2].text
                            json_tr[3] = tds[3].text
                            json_tr[4] = tds[4].text
                            json_tr[5] = tds[5].text
                            json_tr[6] = tds[6].text  # 注意：header有6个字段，但是内容确有7个字段，最后一个字段包含增/减持数量
                            json_trs[count-2] = json_tr
                            #print tds
                json_table["data"] = json_trs
                
                self.logger.info(url+ " json_table[\"tatal\"]:"+ json_table["tatal"])
                print json_table
            else:
                continue
        
    # tencent     
    def GetZJC_Tencent(self, market, stockNO):
        # http://stock.finance.qq.com/hk/hklist/view/rights_main_holder.php?c=00699&b=00000000&max=50
        market = "hk"
        if market == "hk":
            url = "http://stock.finance.qq.com/hk/hklist/view/rights_main_holder.php?c="+stockNO+"&b=00000000&max=50"
        
        html = self.cURL(url)
        soup = BeautifulSoup(html)
        #header [序号    机构    变动方向    变动股份数    变动后数量    变动后持股率    公布时间]
        #content [1 WP Global LLC 增持 430,471,340 430,471,340 18.77% 2014-09-19]
        tables = soup.find('table', attrs = {'class': ['new-table']})
        #print tables
        trs = tables.findAll('tr')
        count = 0
        json_data = {}
        for tr in trs:
            count = count + 1
            if count >3:
                #print tr.text
                ths = tr.findAll('th')
                tds = tr.findAll('td')
                tr_dict_data = {}
                tr_dict_data[0] = ths[0].text
                tr_dict_data[1] = ths[1].text
                tr_dict_data[2] = ths[2].text
                tr_dict_data[3] = tds[0].text   # !!! tds
                tr_dict_data[4] = tds[1].text   # !!! tds 
                tr_dict_data[5] = tds[2].text   # !!! tds
                tr_dict_data[6] = tds[3].text   # !!! tds
                json_data[count-4] = tr_dict_data
                
        self.logger.info(url)
        print json.dumps(json_data)
    
    # Sina     
    def GetMaxCount_Sina(self, url):
        # list_info
        html = self.cURL(url)
        
        soup = BeautifulSoup(html)
        dl = soup.find('dl', attrs = {'class': ['list_info']})
        #print dl.text
        # 共13337页，共266733条评级
        content = dl.text
        l = content.split(u'，')  # 注意是  全角符号
        pages = filter(lambda x:x.isdigit(), l[0])
        ratings = filter(lambda x:x.isdigit(), l[1])
        
        self.logger.info(url+ " pages:"+ pages +" ratings:" + ratings)    
        return (pages, ratings)

    # Sina        
    def GetRating_Sina(self, url, pages):
        html = self.cURL(url+"?p="+pages)
        soup = BeautifulSoup(html)
        rating_table = soup.find('table')
        trs = rating_table.findAll('tr')
        count = 0
        #[head] 股票名称 股票代码 投行名称 最新评级 最新价 目标价 目标价变化 评级时间 研究报告 近期研报
        #[content] 昆仑能源 00135 麦格理 买入 -- 14.50 下降 2014-12-02  查看 查看 
        json_data = {}
        #dict_data = {}
        json_item = {}
        
        for tr in trs:
            count = count + 1
            if count == 1:
                #header 转化为json后，此处数据可以不体现
                #header = []
                #tds = tr.findAll('td')
                #for td in tds:
                #    header.append(td.text)
                #json_data.append(header)
                continue
            else:
                ths = tr.findAll('th') #Sina finance
                tds = tr.findAll('td') #Sina finance
                for th in ths:
                    tr_dict_data = {}
                    tr_dict_data[0] = ths[0].text
                    tr_dict_data[1] = ths[1].text
                    tr_dict_data[2] = ths[2].text
                    tr_dict_data[3] = ths[3].text
                    tr_dict_data[4] = tds[0].text  # !!! tds 
                    tr_dict_data[5] = tds[1].text  # !!! tds
                    tr_dict_data[6] = ths[4].text
                    tr_dict_data[7] = ths[5].text
                    tr_dict_data[8] = ths[6].text
                    tr_dict_data[9] = ths[7].text
                    #json_data[count-1] = tr_dict_data
                json_item[count-2] = tr_dict_data
        json_data["content"] = json_item
        json_data["rows"] = count - 2
        
        self.logger.info(url+ " pages:"+ pages +" rows:" + str(count-2))
        return json.dumps(json_data);        

def test():
    """Run one full scraping pass; invoked by timer_cb every 60 seconds."""
    LOG_FILE = 'spider.log'

    logger = logging.getLogger('spider')    # named logger shared across calls
    # BUGFIX: this function runs once per timer tick; the original added a
    # fresh RotatingFileHandler on EVERY call, so each log line was written
    # N times after N ticks.  Configure the handler only once.
    if not logger.handlers:
        handler = logging.handlers.RotatingFileHandler(
            LOG_FILE, maxBytes=1024 * 1024, backupCount=5)
        fmt = '%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(message)s'
        handler.setFormatter(logging.Formatter(fmt))
        logger.addHandler(handler)
    # Level order: NOTSET < DEBUG < INFO < WARNING < ERROR < CRITICAL;
    # DEBUG means everything reaches the log file.
    logger.setLevel(logging.DEBUG)

    s = Spider(logger)
    pages, ratings = s.GetMaxCount_Sina("http://money.finance.sina.com.cn/hk/rating.php")
    content = s.GetRating_Sina("http://money.finance.sina.com.cn/hk/rating.php", "2")

    s.GetZJC_Tencent("hk", "00699")
    s.GetCiHolder("002432")
    s.GetHolder("002432")

# SIGINT handler for the pyev event loop (libev bindings): stop every
# watcher registered in loop.data, then break out of the loop entirely.
def sig_cb(watcher, revents):
    print("sig callback ...")
    loop = watcher.loop
    while loop.data:
        loop.data.pop().stop()
    loop.stop(pyev.EVBREAK_ALL)
    
# pyev timer callback (libev bindings): fires on every interval, counts
# invocations in watcher.data, then performs one scraping pass.
def timer_cb(watcher, revents):
    print("callback ...")
    watcher.data = watcher.data + 1
    # run the actual work
    test()
        
if __name__=="__main__":
    loop = pyev.default_loop()
    # Fire timer_cb immediately (after 0s), then every 60 seconds;
    # watcher data starts at 0 and counts invocations (see timer_cb).
    timer = loop.timer(0, 60, timer_cb, 0)
    timer.start()
    # Ctrl-C (SIGINT) stops the watchers and breaks the loop (see sig_cb).
    sig = loop.signal(signal.SIGINT, sig_cb)
    sig.start()
    # sig_cb pops these watchers from loop.data to stop them on shutdown.
    loop.data = [timer, sig]
    loop.start()
    
    
    